A consumer bank with a range of products would like to cross-sell insurance to its consumer base (that is, cross-sell the personal protection insurance (PPI) product to those customers who have a secured or unsecured type of loan, but no PPI product as yet). Data is provided for their customer portfolio containing various fields about their product ownership, credit standing, outstanding amounts, whether they already have an insurance product (called PPI, personal protection insurance) and, if so, the type of PPI product they own.
The bank would like to adopt an analytics-driven approach, applied to this sample data, for deciding:
## Notebook display setup: widen the output container to 90% of the window.
# FIX: `IPython.core.display` is deprecated (since IPython 7.14) — display/HTML
# live in the public `IPython.display` module.
from IPython.display import display, HTML
display(HTML('<style>.container{width:90% !important;}</style>'))
## Import Packages
# Standard library
import collections
import pprint
import warnings

# Third-party
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
# FIX: `scipy.stats.stats` is a deprecated alias of `scipy.stats` and has been
# removed in recent SciPy releases — import the public module instead.
import scipy.stats as stats
import seaborn as sns
import sklearn

# Project-local helpers
import helperFunctions as hf
from modelCompare import *

# NOTE: this hides ALL warnings (including deprecations) for cleaner notebook output.
warnings.filterwarnings('ignore')
## Set Options — show up to 1000 rows / 100 columns when rendering frames
for option_name, option_value in {'display.max_rows': 1000,
                                  'display.max_columns': 100}.items():
    pd.set_option(option_name, option_value)
# pd.set_option('display.max_colwidth', 400)
## Read data - excel file
# Load the customer portfolio sample from the 'Data' sheet of the workbook
# (requires an Excel engine such as openpyxl to be installed).
data = pd.read_excel('Mortgage_Insurance_Data.xlsx', sheet_name = 'Data')
print(f'Shape of the dataframe: {data.shape}')
Shape of the dataframe: (16383, 59)
## Classify every column as Numerical (int64/float64) vs Categorical and count each group.
t = data.dtypes.reset_index()
is_numeric = t[0].astype(str).isin(['int64', 'float64'])
t['Type'] = np.where(is_numeric, 'Numerical', 'Categorical')
t.groupby('Type').size()
Type Categorical 22 Numerical 37 dtype: int64
## Descriptive stats - numerical data
# hf.custom_describe is a project helper; from the columns consumed below it
# returns, per numerical column, count/type/mean/std/quartiles plus
# 'SkewnessComment' and 'OutliersComment' flags — TODO confirm against helperFunctions.py.
n = hf.custom_describe(data)
display(n.head(2))
| Count | Type | Mean | StandardDeviation | Minimum | Q1 | Median | Q3 | Maximum | IQR | Skewness | SkewnessComment | OutliersComment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Ref | 16383 | int64 | 21168.70 | 12142.80 | 1.0 | 10699.0 | 21303.0 | 31689.0 | 42072.0 | 20990.0 | -0.02 | Fairly Symmetrical (Left) | NoOutliers |
| Credit_Score | 16383 | int64 | 882.06 | 93.15 | 352.0 | 818.0 | 882.0 | 947.0 | 1295.0 | 129.0 | -0.07 | Fairly Symmetrical (Right) | HasOutliers |
## Columns whose describe-table flags them as highly skewed
hw = n.index[n['SkewnessComment'].str.contains('Highly Skewed')]
print(f'\n{len(hw)} variables with Highly Skewed Distribution:', list(hw))
## Columns whose describe-table flags them as moderately skewed
ms = n.index[n['SkewnessComment'].str.contains('Moderately Skewed')]
print(f'\n{len(ms)} variables with Moderately Skewed Distribution:', list(ms))
## Columns whose describe-table flags them as fairly symmetrical
fs = n.index[n['SkewnessComment'].str.contains('Fairly Symmetrical')]
print(f'\n{len(fs)} variables with Fairly Symmetrical Distribution:', list(fs))
## Columns whose minimum observed value is negative
nv = n.index[n['Minimum'] < 0]
print(f'\n{len(nv)} negative minimum valued variables:', list(nv))
## Variables with outliers
# FIX: the counts ("29" and "8") were hard-coded in the printed message;
# derive them from the describe table so the report stays correct if the data changes.
has_outliers = n.index[n['OutliersComment'] == 'HasOutliers']
no_outliers = n.index[n['OutliersComment'] == 'NoOutliers']
print(f'\n{len(has_outliers)} numerical columns have outliers and {len(no_outliers)} do not have any outliers')
print('\nList of columns with outliers', list(has_outliers))
print('\nList of columns with NO outliers', list(no_outliers))
25 variables with Highly Skewed Distribution: ['Term', 'Net_Advance', 'Mosaic_Class', 'Time_at_Address', 'Number_of_Dependants', 'Time_in_Employment', 'Value_of_Property', 'Outstanding_Mortgage_Bal', 'Total_Outstanding_Balances', 'Bureau_Data___Monthly_Other_Co_R', 'Total_outstanding_balance__mortg', 'Total___Public_Info___CCJ____ban', 'Total_value__Public_Info___CCJ__', 'Time_since_most_recent_Public_In', 'Total_value__CAIS_8_9s', 'Worst_status_L6m', 'Worst_CUrrent_Status', '__of_status_3_s_L6m', 'Years_on_ER_for_SP', 'Total___outstanding_CCJ_s', 'Total_outstanding_balance___excl', 'Time_since_most_recent_outstandi', 'code', 'PPI_JOINT', 'PPI_LCI'] 4 variables with Moderately Skewed Distribution: ['APR', 'Income_Range', 'Time_with_Bank', 'Searches___Total___L6m'] 8 variables with Fairly Symmetrical Distribution: ['Ref', 'Credit_Score', 'Mosaic', 'Worst_History_CT', 'Age', 'Total___of_accounts', 'PPI', 'PPI_SINGLE'] 12 negative minimum valued variables: ['Age', 'Total_outstanding_balance__mortg', 'Total___Public_Info___CCJ____ban', 'Total_value__Public_Info___CCJ__', 'Time_since_most_recent_Public_In', 'Total_value__CAIS_8_9s', '__of_status_3_s_L6m', 'Years_on_ER_for_SP', 'Total___outstanding_CCJ_s', 'Total_outstanding_balance___excl', 'Total___of_accounts', 'Time_since_most_recent_outstandi'] 29 numerical columns have outliers and 8 do not have any outliers List of columns with outliers ['Credit_Score', 'Term', 'Net_Advance', 'Mosaic', 'Mosaic_Class', 'Time_at_Address', 'Number_of_Dependants', 'Time_in_Employment', 'Time_with_Bank', 'Value_of_Property', 'Outstanding_Mortgage_Bal', 'Total_Outstanding_Balances', 'Bureau_Data___Monthly_Other_Co_R', 'Age', 'Total_outstanding_balance__mortg', 'Total___Public_Info___CCJ____ban', 'Total_value__Public_Info___CCJ__', 'Time_since_most_recent_Public_In', 'Total_value__CAIS_8_9s', 'Worst_status_L6m', 'Worst_CUrrent_Status', '__of_status_3_s_L6m', 'Searches___Total___L6m', 'Years_on_ER_for_SP', 'Total___outstanding_CCJ_s', 
'Total_outstanding_balance___excl', 'Time_since_most_recent_outstandi', 'code', 'PPI_JOINT', 'PPI_LCI'] List of columns with NO outliers ['Ref', 'APR', 'Income_Range', 'Worst_History_CT', 'Total___of_accounts', 'PPI', 'PPI_SINGLE']
## Analysis for categorical variables:
## 12 vars have 2 unique values, 3 vars have 3, 3 vars have 4,
## and the remaining have 5, 9, 10, 15 and 24 unique values
cat_summary = data.describe(include=['object'])
c = cat_summary.T.sort_values(by='unique')
display(c)
| count | unique | top | freq | |
|---|---|---|---|---|
| ACCESS_Card | 16383 | 2 | FALSE | 13028 |
| Loan_Type | 16383 | 2 | UnSecured | 13054 |
| Telephone_Indicator | 16383 | 2 | Y | 15241 |
| Gender | 16383 | 2 | M | 9770 |
| Full_Part_Time_Empl_Ind | 16383 | 2 | F | 16308 |
| Perm_Temp_Empl_Ind | 16383 | 2 | P | 16361 |
| Current_Account | 16383 | 2 | TRUE | 16351 |
| Other_Credit_Store_Card | 16383 | 2 | FALSE | 12976 |
| VISA_Card | 16383 | 2 | TRUE | 12087 |
| American_Express | 16383 | 2 | FALSE | 16227 |
| Diners_Card | 16383 | 2 | FALSE | 16198 |
| Cheque_Guarantee | 16383 | 2 | TRUE | 14659 |
| CIFAS_detected | 16383 | 3 | N | 16292 |
| Bankruptcy_Detected__SP_ | 16383 | 3 | N | 16351 |
| Payment_Method | 16383 | 3 | D | 16308 |
| category | 9461 | 4 | Single | 6264 |
| Marital_Status | 16383 | 4 | M | 9432 |
| Residential_Status | 16383 | 4 | H | 12572 |
| Employment_Status | 16383 | 9 | P | 10118 |
| Final_Grade | 16383 | 10 | A | 4538 |
| prdt_desc | 9345 | 15 | LASU | 3889 |
| Insurance_Description | 9461 | 24 | LASU | 2351 |
print('Unique values in categorical variables...')
# How many categorical columns share each cardinality level
cardinality_counts = c.reset_index().groupby('unique').size()
display(cardinality_counts)
Unique values in categorical variables...
unique 2 12 3 3 4 3 9 1 10 1 15 1 24 1 dtype: int64
Including data cleaning, univariate, bivariate, multi-variate analysis. Findings from exploratory data analysis:
Insurance_Description, code, prdt_desc, and category variables had ~42% missing values. Missing values in categorical variables were filled with "Missing" and in numerical variables with -1. The Telephone_Indicator, Full_Part_Time_Empl_Ind, Perm_Temp_Empl_Ind, Current_Account, VISA_Card, American_Express, Diners_Card, Payment_Method, Bankruptcy_Detected__SP_, and CIFAS_detected variables were dropped from the further analysis — they had no impact in deciding whether a PPI is bought or not. ## Are there any duplicated rows?
## Duplicate-row checks.
print('Is Ref a unique identifier?. Yes it looks like.')
# FIX: the label says "Count and Count Distinct" but the original printed
# nunique() first — report row count first, then the distinct count.
print('Count and Count Distinct in Ref column: {}, {}'.format(data['Ref'].shape[0], data['Ref'].nunique()))
print('Are there any other duplicates even if I leave the unique identifier? No.')
# FIX: `data[1:][data[1:].duplicated()]` only skipped the first ROW while still
# comparing on every column including Ref; since Ref is unique, duplicated()
# could never flag anything. The stated intent is to ignore the identifier,
# i.e. exclude the Ref COLUMN from the comparison.
display(data[data.duplicated(subset=data.columns.drop('Ref'))].values)
Is Ref a unique identifier?. Yes it looks like. Count and Count Distinct in Ref column: 16383, 16383 Are there any other duplicates even if I leave the unique identifier? No.
array([], shape=(0, 59), dtype=object)
print("About 42 percent of the customer don't hold a PPI product and is our target population.")
## Pie of PPI holders vs non-holders
ppi_counts = data.groupby('PPI').size().to_frame('counts').reset_index()
ppi_counts['PPI'] = ppi_counts['PPI'].replace({0: "Don't Hold PPI", 1: 'Hold PPI'})
fig = px.pie(ppi_counts, values='counts', names='PPI', color='PPI',
             height=400, width=500,
             color_discrete_map={"Don't Hold PPI": '#F2B705', 'Hold PPI': '#D9CEC1'})
fig.show()
About 42 percent of the customer don't hold a PPI product and is our target population.
print('Identify columns with missing values in the dataframe and propotion of missing values')
# FIX: the original evaluated `data.isnull().sum()` four times (four full-frame
# scans); compute the null counts once and reuse.
missing = data.isnull().sum()
missing = missing[missing > 0]
pd.DataFrame([missing, missing / len(data)],
             index = ['Missing No', 'Missing Proportion']).T
Identify columns with missing values in the dataframe and propotion of missing values
| Missing No | Missing Proportion | |
|---|---|---|
| Insurance_Description | 6922.0 | 0.422511 |
| code | 6978.0 | 0.425929 |
| prdt_desc | 7038.0 | 0.429592 |
| category | 6922.0 | 0.422511 |
## Impute missing values: "Missing" for the categorical columns, -1 for the numeric `code`.
for cat_col in ('Insurance_Description', 'category', 'prdt_desc'):
    data[cat_col] = data[cat_col].fillna('Missing')
data['code'] = data['code'].fillna(-1)
## Normalise the casing of "JOINT" in the "category" column, then plot its distribution.
print('Replaced "JOINT" with "Joint" in the category column')
data['category'] = data['category'].replace('JOINT', 'Joint')
category_counts = data.groupby('category').size().to_frame('counts').reset_index()
fig = px.pie(category_counts, values='counts', names='category', color='category',
             title='Distribution of values in category variable',
             height=600, width=500,
             color_discrete_map={'Missing': '#B9FFFC', 'Single': '#A3D8F4',
                                 'LCI': '#9AB3F5', 'Joint': '#7579E7'})
fig.show()
Replaced "JOINT" with "Joint" in the category column
## "Insurance_Description" standardization
# Collapse the many spelling/spacing variants of the same product into one
# canonical label (Series.replace matches whole values, not substrings).
data['Insurance_Description'] = data['Insurance_Description'].str.strip().replace({
    'LIFE & CRITICAL ILLN': 'Life & Critical Illn',
    'Life & CI': 'Life & Critical Illn',
    'JOINT LIFE & CRITICA': 'Joint Life & Critica',
    '1st Cust - LASCI / 2': '1st Cust LASCI / 2nd',
    '1st Cust-LASCI /2nd': '1st Cust LASCI / 2nd',
    '1st Cust - LASU / 2n' : '1st Cust LASU / 2nd',
    '1st Cust-LASU/ 2nd C': '1st Cust LASU / 2nd'})
print('Almost similar columns: "category", "Insurance_Description", "prdt_desc"')
# Cross-tabulate the three related columns to see how their values line up.
display(data.groupby(['category', 'Insurance_Description', 'prdt_desc']).size())
Almost similar columns: "category", "Insurance_Description", "prdt_desc"
category Insurance_Description prdt_desc
Joint 1st Cust LASCI / 2nd 1st Cust LASCI / 2nd Cust LASU 10
1st Cust LASU / 2nd 1st Cust LASU / 2nd Cust LASCI 8
JOINT LASCI JOINT LASCI 39
JOINT LASU JOINT LASU 159
Joint 1st Cust - LASCI / 2nd Cust - LASU 52
1st Cust - LASU / 2nd Cust - LASCI 44
1st Cust LASCI / 2nd Cust LASU 17
1st Cust LASU / 2nd Cust LASCI 10
JOINT LASCI 26
JOINT LASU 113
LASCI JOINT 123
LASU JOINT 604
Other 1st Cust LASCI / 2nd Cust LASU 2
1st Cust LASU / 2nd Cust LASCI 1
JOINT LASCI 4
JOINT LASU 13
LCI Life & Critical Illn LIFE & CRITICAL ILLNESS 1872
Other LIFE & CRITICAL ILLNESS 100
Missing Missing Missing 6922
Single 1st Cust LASCI / 2nd 1st Cust - LASCI / 2nd Cust - LCI 69
1st Cust LASU / 2nd 1st Cust - LASU / 2nd Cust - LCI 242
Bronze Missing 8
Foundation Missing 5
Gold Missing 63
Joint Life & Critica JOINT LIFE & CRITICAL ILLNESS 351
LASCI LASCI 717
LASU LASU 2351
Life & Critical Illn JOINT LIFE & CRITICAL ILLNESS 196
Other 1st Cust - LASCI / 2nd Cust - LCI 5
1st Cust - LASU / 2nd Cust - LCI 16
JOINT LIFE & CRITICAL ILLNESS 24
LASCI 66
LASU 262
Missing 2
Plus Missing 23
Select Missing 8
Silver Missing 7
Single 1st Cust - LASCI / 2nd Cust - LCI 55
1st Cust - LASU / 2nd Cust - L 2
1st Cust - LASU / 2nd Cust - LCI 129
LASCI 387
LASU 1276
dtype: int64
## Looking at "category" and "Insurance_Description"
# Stacked bar: Insurance_Description counts within each category value.
_ = (data.groupby(['category', 'Insurance_Description']).size()
     .sort_values(ascending = False).to_frame('count').reset_index())
fig = px.bar(_, x = 'category', y = 'count', color = 'Insurance_Description',
             height = 600, width = 500, title = 'Category & Insurance Description Analysis')
fig.update_layout(template = 'simple_white')
fig.show()
## Analyzing Bankruptcy_Detected__SP_ variable
# Frequency tables for the two flag columns (observed values: N / Y / T).
display(data['Bankruptcy_Detected__SP_'].value_counts().sort_values(ascending = False),
        data['CIFAS_detected'].value_counts().sort_values(ascending = False))
N 16351 Y 24 T 8 Name: Bankruptcy_Detected__SP_, dtype: int64
N 16292 Y 83 T 8 Name: CIFAS_detected, dtype: int64
## Analyzing "Bankruptcy_Detected__SP_" & "CIFAS_detected"
print('For most customers, bankruptcy and CIFAS wasn\'t detected.')
print('However, for some bankruptcy wasn\'t detected but CIFAS was detected.')
print('I will not be considering customers with either Bankruptcy or CIFAS detected for cross-sell of PPI products.')
print('Also since I\'am not sure of what is "T" here I will drop that as well from the target population.')
print('Target population is customers currently not having a PPI product.')
data.groupby(['Bankruptcy_Detected__SP_', 'CIFAS_detected']).size()
## keeping only the ones without bankruptcy or CIFAS in the target population
# Order matters below: default everything to 'Drop', keep non-PPI customers only
# when BOTH flags are 'N', then keep ALL existing PPI holders unconditionally
# (the filter applies only to the cross-sell target population).
data['Check'] = 'Drop'
data.loc[(data['Bankruptcy_Detected__SP_'] == 'N') &
         (data['CIFAS_detected'] == 'N') &
         (data['PPI'] == 0), 'Check'] = 'Keep'
data.loc[(data['PPI']==1), 'Check'] = 'Keep'
For most customers, bankruptcy and CIFAS wasn't detected. However, for some bankruptcy wasn't detected but CIFAS was detected. I will not be considering customers with either Bankruptcy or CIFAS detected for cross-sell of PPI products. Also since I'am not sure of what is "T" here I will drop that as well from the target population. Target population is customers currently not having a PPI product.
## Apply the Keep/Drop flag computed above, then discard the helper column.
print('Number of customers with either a bankruptcy or a CIFAS detected (Y & T):',
      (data['Check'] == 'Drop').sum())
data = data[data['Check'] == 'Keep']
print('Shape of dataframe after dropping customers with either a bankruptcy or a CIFAS detected (Y & T):', data.shape)
data = data.drop(columns = 'Check')
Number of customers with either a bankruptcy or a CIFAS detected (Y & T): 37 Shape of dataframe after dropping customers with either a bankruptcy or a CIFAS detected (Y & T): (16346, 60)
## Inspect unique values in categorical columns - spell correction and space removal needed
cat_cols = data.select_dtypes(include='object').columns
for col in cat_cols:
    print('{} column unique values: {}'.format(col, list(data[col].unique())))

## Strip surrounding whitespace and repair the truncated 'FALS' spelling
def _clean_categorical(series):
    # Series.replace matches whole cell values, so only exact 'FALS' cells change.
    return series.str.strip().replace('FALS', 'FALSE')

data[cat_cols] = data[cat_cols].apply(_clean_categorical)
Final_Grade column unique values: ['A', 'X', 'B', 'D', 'E', 'C', 'R', 'F', 'O', 'S'] Loan_Type column unique values: ['UnSecured', 'secured '] Residential_Status column unique values: ['H', 'R', 'T', 'L'] Telephone_Indicator column unique values: ['Y', 'N'] Marital_Status column unique values: ['M', 'S', 'D', 'W'] Gender column unique values: ['M', 'F'] Employment_Status column unique values: ['S', 'R', 'P', 'G', 'D', 'M', 'H', 'U', 'T'] Full_Part_Time_Empl_Ind column unique values: ['F', 'P'] Perm_Temp_Empl_Ind column unique values: ['P', 'T'] Current_Account column unique values: ['TRUE', 'FALS'] ACCESS_Card column unique values: ['FALSE', 'TRUE '] VISA_Card column unique values: ['FALSE', 'TRUE '] American_Express column unique values: ['FALSE', 'TRUE '] Diners_Card column unique values: ['FALSE', 'TRUE '] Cheque_Guarantee column unique values: ['TRUE', 'FALS'] Other_Credit_Store_Card column unique values: ['TRUE ', 'FALSE '] Payment_Method column unique values: ['D', 'S', 'C'] Bankruptcy_Detected__SP_ column unique values: ['N', 'Y', 'T'] CIFAS_detected column unique values: ['N', 'Y', 'T'] Insurance_Description column unique values: ['Missing', 'Life & Critical Illn', 'Joint', 'Single', 'LASCI', 'LASU', 'Joint Life & Critica', '1st Cust LASU / 2nd', 'Gold', 'Bronze', 'Select', 'Silver', '1st Cust LASCI / 2nd', 'Other', 'Plus', 'JOINT LASU', 'Foundation', 'JOINT LASCI'] prdt_desc column unique values: ['Missing', 'LIFE & CRITICAL ILLNESS', 'LASCI JOINT', 'LASCI', 'LASU', 'LASU JOINT', 'JOINT LIFE & CRITICAL ILLNESS', '1st Cust - LASU / 2nd Cust - LCI', '1st Cust - LASU / 2nd Cust - LASCI', '1st Cust - LASCI / 2nd Cust - LCI', '1st Cust - LASCI / 2nd Cust - LASU', '1st Cust - LASU / 2nd Cust - L', 'JOINT LASU', '1st Cust LASU / 2nd Cust LASCI', '1st Cust LASCI / 2nd Cust LASU', 'JOINT LASCI'] category column unique values: ['Missing', 'LCI', 'Joint', 'Single']
## Handle high cardinality - leaving apart the 'Insurance_Description' and 'prdt_desc'
# Reference: https://medium.com/analytics-vidhya/how-handle-high-cardinality-d889ca61b6a6
print('Considering all below 5% in proportion as "Others" provided that the column has more than 3 values')
cardCols = list(data.select_dtypes(include='object').columns)
# cardCols.remove('Insurance_Description')
cardCols.remove('prdt_desc')
for col in cardCols:
    counts = data[col].value_counts(normalize=True)
    if len(counts) > 3:
        # FIX: renamed the single-letter `l` (easily misread as 1/I).
        rare_values = list(counts.loc[counts < 0.05].index)
        data.loc[data[col].isin(rare_values), col] = 'Others'
        print(f'{col} - List of values replaced as "Others":', rare_values)
        # NOTE(review): original indentation was lost on export; the plot is
        # placed inside the >3-values branch to match the printed output — confirm.
        _ = data.groupby(col).size().sort_values(ascending = False).to_frame('count').reset_index()
        fig = px.bar(_, x = col, y = 'count', color = col, height = 600, width = 500,
                     title = 'Plot of Values in {}'.format(col))
        fig.update_layout(template = 'simple_white')
        fig.show()
Considering all below 5% in proportion as "Others" provided that the column has more than 3 values Final_Grade - List of values replaced as "Others": ['F', 'O', 'R', 'S']
Residential_Status - List of values replaced as "Others": ['L', 'R']
Marital_Status - List of values replaced as "Others": ['W']
Employment_Status - List of values replaced as "Others": ['R', 'D', 'M', 'U', 'H', 'T']
Insurance_Description - List of values replaced as "Others": ['LASCI', 'Other', 'Joint Life & Critica', '1st Cust LASU / 2nd', 'JOINT LASU', '1st Cust LASCI / 2nd', 'Gold', 'JOINT LASCI', 'Plus', 'Bronze', 'Select', 'Silver', 'Foundation']
category - List of values replaced as "Others": []
print('Final_Grade - A, X and B are contributing most for PPI products being bought by the customers')
# Counts of Final_Grade within each PPI class (0 = no PPI, 1 = holds PPI).
_ = data.groupby(['PPI', 'Final_Grade']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Final_Grade', height = 600, width = 500,
             color_discrete_sequence=px.colors.qualitative.Bold)
fig.update_layout(template = 'simple_white')
fig.show()
Final_Grade - A, X and B are contributing most for PPI products being bought by the customers
print('Loan_Type - Customers preference is towards unsecured loans')
# Counts of Loan_Type within each PPI class.
_ = data.groupby(['PPI', 'Loan_Type']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Loan_Type', height = 600, width = 500,
             color_discrete_sequence=px.colors.qualitative.Vivid)
fig.update_layout(template = 'simple_white')
fig.show()
Loan_Type - Customers preference is towards unsecured loans
print('Residential_Status - In comparison with customers buying a PPI vs not buying one ...')
pprint.pprint('the percentage of T residential status increases, while H decreases.')
# Counts of Residential_Status within each PPI class.
_ = data.groupby(['PPI', 'Residential_Status']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Residential_Status', height = 600, width = 500,
             color_discrete_sequence=px.colors.qualitative.Vivid)
fig.update_layout(template = 'simple_white')
fig.show()
Residential_Status - In comparison with customers buying a PPI vs not buying one ... 'the percentage of T residential status increases, while H decreases.'
print('Telephone_Indicator - People buying PPI or not is weakly affected by whether or not they have a telephone.')
# Counts of Telephone_Indicator within each PPI class, largest first.
_ = (data.groupby(['PPI', 'Telephone_Indicator']).size()
     .to_frame('count').reset_index()
     .sort_values(by = 'count', ascending = False))
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Telephone_Indicator', height = 600, width = 500,
             color_discrete_sequence=px.colors.qualitative.Set2)
fig.update_layout(template = 'simple_white')
fig.show()
# Earlier correlation probe, kept for reference:
# data['Telephone_Indicator'].replace({'N':0, 'Y': 1}, inplace = True)
# print('Correlation between both the variables')
# display(data[['PPI', 'Telephone_Indicator']].corr())
## 'Telephone_Indicator' variable is dropped from the further analysis
del data['Telephone_Indicator']
Telephone_Indicator - People buying PPI or not is weakly affected by whether or not they have a telephone.
## Assuming D-Divorced, M-Married, S-Single, W-Widowed
print('Marital_Status - Proportion of Married people having a PPI is slightly lower than not having it')
pprint.pprint('whereas proportion of Single, Widowed, and Divorced people prefer having a PPI product.')
# Counts of Marital_Status within each PPI class.
_ = data.groupby(['PPI', 'Marital_Status']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Marital_Status', height = 600, width = 500,
             color_discrete_sequence=px.colors.qualitative.Plotly)
fig.update_layout(template = 'simple_white')
fig.show()
Marital_Status - Proportion of Married people having a PPI is slightly lower than not having it
('whereas proportion of Single, Widowed, and Divorced people prefer having a '
'PPI product.')
print('Gender - Percentage of Female total customers having a PPI product is ~2% higher than total Female customers not having it.')
# Overall gender split (pie) plus gender split per PPI class (bar).
_ = data.groupby('Gender').size().to_frame('count').reset_index()
fig1 = px.pie(_, names = 'Gender', color = 'Gender', values = 'count', height = 600, width = 500,
              color_discrete_sequence = px.colors.qualitative.D3)
_1 = data.groupby(['PPI', 'Gender']).size().to_frame('count').reset_index()
fig2 = px.bar(_1, x = 'PPI', y = 'count', color = 'Gender', height = 600, width = 500,
              color_discrete_sequence=px.colors.qualitative.D3)
fig1.update_layout(template = 'simple_white')
fig2.update_layout(template = 'simple_white')
fig1.show()
fig2.show()
Gender - Percentage of Female total customers having a PPI product is ~2% higher than total Female customers not having it.
print('Employment Status - In a total customer with a PPI, the proportion of P increases by ~5%.')
pprint.pprint('The proportion of G makes up about 3% of the decrease in a total customer with a PPI versus not having a PPI.')
# Overall employment-status split (pie) plus split per PPI class (bar).
_ = data.groupby('Employment_Status').size().to_frame('count').reset_index()
fig1 = px.pie(_, names = 'Employment_Status', color = 'Employment_Status', values = 'count', height = 600, width = 500,
              color_discrete_map = {'P': '#716F81', 'G': '#B97A95', 'S': '#F6AE99', 'Others': '#F2E1C1'})
_1 = data.groupby(['PPI', 'Employment_Status']).size().to_frame('count').reset_index()
fig2 = px.bar(_1, x = 'PPI', y = 'count', color = 'Employment_Status', height = 600, width = 500,
              color_discrete_map = {'P': '#716F81', 'G': '#B97A95', 'S': '#F6AE99', 'Others': '#F2E1C1'})
fig1.update_layout(template = 'simple_white')
fig2.update_layout(template = 'simple_white')
fig1.show()
fig2.show()
Employment Status - In a total customer with a PPI, the proportion of P increases by ~5%.
('The proportion of G makes up about 3% of the decrease in a total customer '
'with a PPI versus not having a PPI.')
print('Full_Part_Time_Empl_Ind - Despite the fact that full-time employees represent a large portion of the data...')
pprint.pprint('they are of little significance when determining whether a PPI product can be targeted to them.')
# Counts of full/part-time indicator within each PPI class.
_ = data.groupby(['PPI', 'Full_Part_Time_Empl_Ind']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Full_Part_Time_Empl_Ind', height = 600, width = 500,
             color_discrete_map = {'F': '#F1ECC3', 'P': '#57837B'})
fig.update_layout(template = 'simple_white')
fig.show()
# Earlier correlation probe, kept for reference:
# data['Full_Part_Time_Empl_Ind'].replace({'F':1, 'P':0}, inplace = True)
# data[['PPI', 'Full_Part_Time_Empl_Ind']].corr()
## dropping the 'Full_Part_Time_Empl_Ind' variable
del data['Full_Part_Time_Empl_Ind']
Full_Part_Time_Empl_Ind - Despite the fact that full-time employees represent a large portion of the data...
('they are of little significance when determining whether a PPI product can '
'be targeted to them.')
print('Perm_Temp_Empl_Ind - A weaker attribute towards determining...')
pprint.pprint('whether a PPI product can be targetted to people with permanent or temporary employment.')
# Counts of permanent/temporary indicator within each PPI class.
_ = data.groupby(['PPI', 'Perm_Temp_Empl_Ind']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Perm_Temp_Empl_Ind', height = 600, width = 500,
             color_discrete_map = {'P': '#6E6D6D', 'T': '#FFAA71'})
fig.update_layout(template = 'simple_white')
fig.show()
# Earlier correlation probe, kept for reference:
# data['Perm_Temp_Empl_Ind'].replace({'P':1, 'T': 0}, inplace = True)
# data[['PPI', 'Perm_Temp_Empl_Ind']].corr()
## dropping the 'Perm_Temp_Empl_Ind' variable (original comment wrongly said Full_Part_Time_Empl_Ind)
del data['Perm_Temp_Empl_Ind']
Perm_Temp_Empl_Ind - A weaker attribute towards determining...
('whether a PPI product can be targetted to people with permanent or temporary '
'employment.')
print('Current_Account - An insignificant variable in determining a PPI product to targeted to or not.')
# Counts of Current_Account within each PPI class.
_ = data.groupby(['PPI', 'Current_Account']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Current_Account', height = 600, width = 500,
             color_discrete_map = {'TRUE': '#EEB76B', 'FALSE': '#F1D6AB'})
fig.update_layout(template = 'simple_white')
fig.show()
## Dropping 'Current_Account' variable
del data['Current_Account']
Current_Account - An insignificant variable in determining a PPI product to targeted to or not.
print('ACCESS_Card - Percentage of customers having a PPI without access card is ~2% higher than not having a PPI.')
# Overall ACCESS_Card split (pie) plus split per PPI class (bar); column is kept.
_ = data.groupby('ACCESS_Card').size().to_frame('count').reset_index()
fig1 = px.pie(_, names = 'ACCESS_Card', color = 'ACCESS_Card', values = 'count', height = 600, width = 500,
              color_discrete_map = {'FALSE': '#E8E8E8', 'TRUE': '#495464'})
_1 = data.groupby(['PPI', 'ACCESS_Card']).size().to_frame('count').reset_index()
fig2 = px.bar(_1, x = 'PPI', y = 'count', color = 'ACCESS_Card', height = 600, width = 500,
              color_discrete_map = {'FALSE': '#E8E8E8', 'TRUE': '#495464'})
fig1.update_layout(template = 'simple_white')
fig2.update_layout(template = 'simple_white')
fig1.show()
fig2.show()
ACCESS_Card - Percentage of customers having a PPI without access card is ~2% higher than not having a PPI.
print('VISA_Card - A 1% here and there against customers to be preferred with visa card or not - not that significant.')
# Counts of VISA_Card within each PPI class.
_ = data.groupby(['PPI', 'VISA_Card']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'VISA_Card', height = 600, width = 500,
             color_discrete_map = {'FALSE': '#A0C1B8', 'TRUE': '#719FB0'})
fig.update_layout(template = 'simple_white')
fig.show()
## Dropping 'VISA_Card' from the further analysis
del data['VISA_Card']
VISA_Card - A 1% here and there against customers to be preferred with visa card or not - not that significant.
print('American_Express - A ~1% here and there against customers to be preferred with American express card or not.')
# Counts of American_Express within each PPI class.
_ = data.groupby(['PPI', 'American_Express']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'American_Express', height = 600, width = 500,
             color_discrete_map = {'FALSE': '#6b705c', 'TRUE': '#a5a58d'})
fig.update_layout(template = 'simple_white')
fig.show()
## Dropping 'American_Express' from the further analysis
del data['American_Express']
American_Express - A ~1% here and there against customers to be preferred with American express card or not.
print('Diners_Card - A ~1% here and there against customers to be preferred with diners card or not.')
# Counts of Diners_Card within each PPI class.
_ = data.groupby(['PPI', 'Diners_Card']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Diners_Card', height = 600, width = 500,
             color_discrete_map = {'FALSE': '#283618', 'TRUE': '#fefae0'})
fig.update_layout(template = 'simple_white')
fig.show()
## Dropping 'Diners_Card' from the further analysis
del data['Diners_Card']
Diners_Card - A ~1% here and there against customers to be preferred with diners card or not.
print('Cheque_Guarantee - Customers with a PPI hold a cheque guarantee about ~2% higher than the customers without a PPI.')
# Counts of Cheque_Guarantee within each PPI class; column is kept.
_ = data.groupby(['PPI', 'Cheque_Guarantee']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Cheque_Guarantee', height = 600, width = 500,
             color_discrete_map = {'FALSE': '#8ecae6', 'TRUE': '#219ebc'})
fig.update_layout(template = 'simple_white')
fig.show()
Cheque_Guarantee - Customers with a PPI hold a cheque guarantee about ~2% higher than the customers without a PPI.
print('Other_Credit_Store_Card - Almost a 2% difference in total number of customers holding other credit card store...')
pprint.pprint('when compared against customers having a PPI product or not having it.')
# Counts of Other_Credit_Store_Card within each PPI class; column is kept.
_ = data.groupby(['PPI', 'Other_Credit_Store_Card']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Other_Credit_Store_Card', height = 600, width = 500,
             color_discrete_map = {'FALSE': '#ffccd5', 'TRUE': '#ff4d6d'})
fig.update_layout(template = 'simple_white')
fig.show()
Other_Credit_Store_Card - Almost a 2% difference in total number of customers holding other credit card store... 'when compared against customers having a PPI product or not having it.'
print('Payment_Method - An insignificant variable in considering a PPI product cross-selling.')
# Counts of Payment_Method within each PPI class.
_ = data.groupby(['PPI', 'Payment_Method']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Payment_Method', height = 600, width = 500)
fig.update_layout(template = 'simple_white')
fig.show()
## Dropping 'Payment_Method' variable from further analysis
del data['Payment_Method']
Payment_Method - An insignificant variable in considering a PPI product cross-selling.
print('Bankruptcy_Detected__SP_ - Already played a role in the deciding a targeted population.')
## Dropping 'Bankruptcy_Detected__SP_' from further analysis -
# (already consumed by the Keep/Drop target-population filter above)
del data['Bankruptcy_Detected__SP_']
Bankruptcy_Detected__SP_ - Already played a role in the deciding a targeted population.
print('CIFAS_detected - Already played a role in deciding target population')
## Dropping 'CIFAS_detected' from the further analysis
# (already consumed by the Keep/Drop target-population filter above)
del data['CIFAS_detected']
CIFAS_detected - Already played a role in deciding target population
print('Insurance_Description - Missing for all customers part of target population.')
pprint.pprint('For all customers who are having a PPI, insurance description is always available.')
# Counts of Insurance_Description within each PPI class.
_ = data.groupby(['PPI', 'Insurance_Description']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Insurance_Description', height = 600, width = 500)
fig.update_layout(template = 'simple_white')
fig.show()
Insurance_Description - Missing for all customers part of target population.
('For all customers who are having a PPI, insurance description is always '
'available.')
print('prdt_desc - Missing for all customers part of target population.')
pprint.pprint('Customers who are having a PPI, prdt_desc is available for ~98% of them.')
# Counts of prdt_desc within each PPI class.
_ = data.groupby(['PPI', 'prdt_desc']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'prdt_desc', height = 600, width = 500)
fig.update_layout(template = 'simple_white')
fig.show()
prdt_desc - Missing for all customers part of target population. 'Customers who are having a PPI, prdt_desc is available for ~98% of them.'
print('category - Missing for all customers part of target population.')
pprint.pprint('For all customers who are having a PPI, category is always available.')
# Counts of category within each PPI class.
_ = data.groupby(['PPI', 'category']).size().to_frame('count').reset_index()
fig = px.bar(_, x = 'PPI', y = 'count', color = 'category', height = 600, width = 500)
fig.update_layout(template = 'simple_white')
fig.show()
category - Missing for all customers part of target population. 'For all customers who are having a PPI, category is always available.'
print('Category and Insurance Description for customers who have a PPI product and where Product Description is missing')
# PPI holders whose prdt_desc was imputed as 'Missing' — which category/description do they carry?
display(data.loc[(data['PPI'] == 1) & (data['prdt_desc'] == 'Missing')]
        [['category', 'Insurance_Description', 'prdt_desc']].value_counts())
Category and Insurance Description for customers who have a PPI product and where Product Description is missing
category Insurance_Description prdt_desc Single Others Missing 116 dtype: int64
## Replace 'FALSE' with 0 and 'TRUE' with 1
cat_cols = list(data.select_dtypes(include='object').columns)
data[cat_cols] = data[cat_cols].apply(lambda x: x.replace({'FALSE':0, 'TRUE':1}))
## Label-encode the remaining object columns, keeping a code -> label lookup in encVal
# FIX: `import sklearn` alone does not reliably expose the `preprocessing`
# submodule; import it explicitly.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
encVal = {}
cat_cols = list(data.select_dtypes(include='object').columns)
for col in cat_cols:
    print('{} column unique values: {}'.format(col, list(data[col].unique())))
    data[col + '_le'] = le.fit_transform(data[col])
    # LabelEncoder assigns code i to the i-th entry of the sorted `classes_`,
    # so the reverse mapping is simply position -> label (replaces the original
    # drop_duplicates/set_index/to_dict chain, which produced the same dict).
    encVal[col] = dict(enumerate(le.classes_))
    del data[col]
Final_Grade column unique values: ['A', 'X', 'B', 'D', 'E', 'C', 'Others'] Loan_Type column unique values: ['UnSecured', 'secured'] Residential_Status column unique values: ['H', 'Others', 'T'] Marital_Status column unique values: ['M', 'S', 'D', 'Others'] Gender column unique values: ['M', 'F'] Employment_Status column unique values: ['S', 'Others', 'P', 'G'] Insurance_Description column unique values: ['Missing', 'Life & Critical Illn', 'Joint', 'Single', 'Others', 'LASU'] prdt_desc column unique values: ['Missing', 'LIFE & CRITICAL ILLNESS', 'LASCI JOINT', 'LASCI', 'LASU', 'LASU JOINT', 'JOINT LIFE & CRITICAL ILLNESS', '1st Cust - LASU / 2nd Cust - LCI', '1st Cust - LASU / 2nd Cust - LASCI', '1st Cust - LASCI / 2nd Cust - LCI', '1st Cust - LASCI / 2nd Cust - LASU', '1st Cust - LASU / 2nd Cust - L', 'JOINT LASU', '1st Cust LASU / 2nd Cust LASCI', '1st Cust LASCI / 2nd Cust LASU', 'JOINT LASCI'] category column unique values: ['Missing', 'LCI', 'Joint', 'Single']
Feature reduction based on the Pearson pairwise correlation matrix, followed by the correlation of each variable with PPI. Where features were multicollinear, their correlations with the PPI variable were compared and the one with the lower correlation was dropped from the analysis.
# Correlation matrix for all variables
def correlation_matrix(df, threshold = 0.8):
    """Plot a lower-triangle heatmap of df's correlation matrix and display
    every off-diagonal pair whose absolute correlation exceeds `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame with numeric columns.
    threshold : float, default 0.8 — minimum |corr| for a pair to be listed.
    """
    corr = df.corr()
    # FIX: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24 —
    # use the builtin `bool` dtype instead.
    mask = np.zeros_like(corr, dtype = bool)
    # Mask the upper triangle (incl. diagonal) so each pair is shown once.
    mask[np.triu_indices_from(mask)] = True
    f, ax = plt.subplots(figsize = (15, 7.2))
    cmap = sns.diverging_palette(220, 10, as_cmap = True)
    sns.heatmap(corr, mask = mask, cmap = cmap, square = True, linewidths = .5, cbar_kws = {'shrink': .5})#, annot = True)
    ax.set_title('Correlation Matrix of Data')
    # Filter for correlation value greater than threshold (|corr| < 1 drops the diagonal)
    sort = corr.abs().unstack()
    sort = sort.sort_values(kind = 'quicksort', ascending = False)
    display(sort[(sort > threshold) & (sort < 1)])
## Candidate feature list: every numeric column except the target (PPI)
## and the row identifier (Ref)
Cols = list(data.select_dtypes(include = np.number).columns)
for excluded in ('PPI', 'Ref'):
    Cols.remove(excluded)
print('List of columns except Ref & PPI::', Cols)
List of columns except Ref & PPI:: ['Credit_Score', 'Term', 'Net_Advance', 'APR', 'Mosaic', 'Mosaic_Class', 'Time_at_Address', 'Number_of_Dependants', 'Time_in_Employment', 'Income_Range', 'ACCESS_Card', 'Cheque_Guarantee', 'Other_Credit_Store_Card', 'Time_with_Bank', 'Value_of_Property', 'Outstanding_Mortgage_Bal', 'Total_Outstanding_Balances', 'Bureau_Data___Monthly_Other_Co_R', 'Worst_History_CT', 'Age', 'Total_outstanding_balance__mortg', 'Total___Public_Info___CCJ____ban', 'Total_value__Public_Info___CCJ__', 'Time_since_most_recent_Public_In', 'Total_value__CAIS_8_9s', 'Worst_status_L6m', 'Worst_CUrrent_Status', '__of_status_3_s_L6m', 'Searches___Total___L6m', 'Years_on_ER_for_SP', 'Total___outstanding_CCJ_s', 'Total_outstanding_balance___excl', 'Total___of_accounts', 'Time_since_most_recent_outstandi', 'code', 'PPI_SINGLE', 'PPI_JOINT', 'PPI_LCI', 'Final_Grade_le', 'Loan_Type_le', 'Residential_Status_le', 'Marital_Status_le', 'Gender_le', 'Employment_Status_le', 'Insurance_Description_le', 'prdt_desc_le', 'category_le']
## Multicollinearity pruning: each variable below was found correlated with
## another retained variable; of every correlated pair, the one with the
## weaker correlation against 'PPI' is the one dropped.
dropCols = {
    'Total_Outstanding_Balances', 'Time_since_most_recent_outstandi',
    'Total___outstanding_CCJ_s', 'Bureau_Data___Monthly_Other_Co_R',
    'Total___Public_Info___CCJ____ban', 'Net_Advance', 'Years_on_ER_for_SP',
    'Outstanding_Mortgage_Bal', '__of_status_3_s_L6m', 'Worst_CUrrent_Status',
    'Mosaic_Class', 'APR', 'category_le',
}
Cols = [name for name in Cols if name not in dropCols]
# data.drop(Cols, axis = 1, inplace = True)
# Correlation matrix for the remaining numerical variables
correlation_matrix(data[Cols], threshold = 0.7)  # threshold = 0.5
Series([], dtype: float64)
# Absolute correlation of each independent (numerical) variable with the target
## |corr(PPI, var)| for every candidate feature; a comprehension replaces
## the original manual append loop (same values, same order)
absCorrwithDep = [abs(data['PPI'].corr(data[var])) for var in Cols]
display(pd.DataFrame([Cols, absCorrwithDep], index = ['Variable', 'Correlation'])
        .T.sort_values('Correlation', ascending = False).reset_index(drop=True).head())
| Variable | Correlation | |
|---|---|---|
| 0 | code | 0.991764 |
| 1 | PPI_SINGLE | 0.672413 |
| 2 | prdt_desc_le | 0.64947 |
| 3 | PPI_LCI | 0.315972 |
| 4 | PPI_JOINT | 0.242807 |
## Keep only the selected features, plus the target and the row id
Cols = Cols + ['PPI', 'Ref']
data = data[Cols]
According to www.listendata.com, Weight of Evidence is:
The weight of evidence tells the predictive power of an independent variable in relation to the dependent variable. Since it evolved from credit scoring world, it is generally described as a measure of the separation of good and bad customers. “Bad Customers” refers to the customers who defaulted on a loan. and “Good Customers” refers to the customers who paid back loan.
According to the same source, Information Value is:
Information value is one of the most useful technique to select important variables in a predictive model. It helps to rank variables on the basis of their importance.
| Information Value | Predictive Power |
|---|---|
| <0.02 | Useless for Prediction |
| 0.02-0.1 | Weak Predictor |
| 0.1-0.3 | Medium Predictor |
| 0.3-0.5 | Strong Predictor |
| >0.5 | Suspicious or too good to be true |
As mentioned in the reference linked in the original notebook (link lost in export), negative values in these fields are treated as data-entry errors:
## For any negative values in the dataset, replace with the column median.
## Assuming negative values were not meant to be...
## 'code' is excluded: its negative values are legitimate.
l = []
for c in list(data.columns):
    try:
        if c != 'code' and min(data[c]) < 0:
            l.append(c)
            ## NOTE: the median is computed before replacement, so it still
            ## includes the negative values (kept to match original behaviour)
            data.loc[data[c] < 0, c] = data[c].median()
    except TypeError:
        ## FIX: was a bare `except: pass`, which silently swallowed every
        ## error; only non-numeric columns (where min()/< raise TypeError)
        ## should be skipped
        pass
print('List of columns containing negative values, except code variable, include:', l)
List of columns containing negative values, except code variable, include: ['Age', 'Total_outstanding_balance__mortg', 'Total_value__Public_Info___CCJ__', 'Time_since_most_recent_Public_In', 'Total_value__CAIS_8_9s', 'Total_outstanding_balance___excl', 'Total___of_accounts']
def prep_dataset(data, target):
    """Split ``data`` into features X and target y via xverse's SplitXY.

    Parameters
    ----------
    data : pandas.DataFrame
        Full modelling dataframe containing the target column.
    target : str
        Name of the target column to separate out.

    Returns
    -------
    tuple
        (X, y) as returned by SplitXY.fit_transform -- per the inline note,
        features X and the target y as a numpy array.
    """
    from xverse.feature_subset import SplitXY
    clf = SplitXY([target]) #Split the dataset into X and y
    X, y = clf.fit_transform(data) #returns features (X) dataset and target(Y) as a numpy array
    return X, y
from xverse.transformer import WOE
## Adding interpretation to IV (predictive-power bands from listendata.com)
def interpretation(iv):
    """Map an Information Value to its predictive-power label.

    Bands: <0.02 useless, [0.02, 0.1) weak, [0.1, 0.3) medium,
    [0.3, 0.5) strong, >=0.5 suspicious ("too good to be true").

    Parameters
    ----------
    iv : float
        Information Value of a variable.

    Returns
    -------
    str
        One of 'useless', 'weak', 'medium', 'strong', 'suspicious'.
    """
    ## Guard chain: each lower bound is implied by the previous branch,
    ## so the original redundant `iv >= ...` checks were dropped.
    if iv < 0.02:
        return 'useless'
    if iv < 0.1:
        return 'weak'
    if iv < 0.3:
        return 'medium'
    if iv < 0.5:
        return 'strong'
    return 'suspicious'
## Prepare dataset - split into X / y in the shape the xverse package expects
X, y = prep_dataset(data, target = 'PPI')
## Calculate WOE and IV for columns of the data dataframe
## (monotonic binning with up to 30 bins per variable)
clf = WOE(mono_max_bins = 30)
## fit WOE on all candidate features against the PPI target
clf.fit(X, y)
## get clf transformed woe dataframe
# woedf = clf.transform(X)
## IV dataframe: one Information_Value per variable, plus the
## human-readable band from interpretation()
IV = clf.iv_df
IV['Interpretation'] = IV['Information_Value'].map(interpretation)
display(IV)
| Variable_Name | Information_Value | Interpretation | |
|---|---|---|---|
| 34 | prdt_desc_le | 0.899262 | suspicious |
| 16 | PPI_SINGLE | 0.718341 | suspicious |
| 33 | code | 0.439489 | strong |
| 8 | Insurance_Description_le | 0.239598 | medium |
| 30 | Value_of_Property | 0.112956 | medium |
| 11 | Mosaic | 0.093838 | weak |
| 28 | Total_value__CAIS_8_9s | 0.077963 | weak |
| 20 | Term | 0.075806 | weak |
| 3 | Credit_Score | 0.067078 | weak |
| 18 | Residential_Status_le | 0.055238 | weak |
| 27 | Total_outstanding_balance__mortg | 0.053702 | weak |
| 29 | Total_value__Public_Info___CCJ__ | 0.049170 | weak |
| 15 | PPI_LCI | 0.048720 | weak |
| 25 | Total___of_accounts | 0.043687 | weak |
| 5 | Final_Grade_le | 0.040833 | weak |
| 21 | Time_at_Address | 0.029012 | weak |
| 7 | Income_Range | 0.022318 | weak |
| 1 | Age | 0.021806 | weak |
| 10 | Marital_Status_le | 0.020299 | weak |
| 4 | Employment_Status_le | 0.018839 | useless |
| 14 | PPI_JOINT | 0.017954 | useless |
| 32 | Worst_status_L6m | 0.016441 | useless |
| 12 | Number_of_Dependants | 0.011651 | useless |
| 24 | Time_with_Bank | 0.010608 | useless |
| 2 | Cheque_Guarantee | 0.008291 | useless |
| 26 | Total_outstanding_balance___excl | 0.006037 | useless |
| 0 | ACCESS_Card | 0.005012 | useless |
| 13 | Other_Credit_Store_Card | 0.003752 | useless |
| 22 | Time_in_Employment | 0.003408 | useless |
| 17 | Ref | 0.002769 | useless |
| 19 | Searches___Total___L6m | 0.002181 | useless |
| 6 | Gender_le | 0.001734 | useless |
| 23 | Time_since_most_recent_Public_In | 0.001429 | useless |
| 31 | Worst_History_CT | 0.000212 | useless |
| 9 | Loan_Type_le | 0.000046 | useless |
print('Based on the IV - Which are the strong, medium, weak attributes?')
## Variables whose IV band is at least 'weak' (suspicious ones are handled separately)
keep_bands = ['strong', 'medium', 'weak']
ivSelCols = list(IV.loc[IV['Interpretation'].isin(keep_bands), 'Variable_Name'])
print(ivSelCols)
Based on the IV - Which are the strong, medium, weak attributes? ['code', 'Insurance_Description_le', 'Value_of_Property', 'Mosaic', 'Total_value__CAIS_8_9s', 'Term', 'Credit_Score', 'Residential_Status_le', 'Total_outstanding_balance__mortg', 'Total_value__Public_Info___CCJ__', 'PPI_LCI', 'Total___of_accounts', 'Final_Grade_le', 'Time_at_Address', 'Income_Range', 'Age', 'Marital_Status_le']
print('Considering only the suspicious, strong, medium, and weak variables based on the IV')
ivSelCols.extend(['PPI', 'Ref', 'Insurance_Description_le'])
## FIX: dedupe while preserving order -- the original list(set(...)) produced
## a column order that changes between runs (string hash randomisation),
## hurting reproducibility of everything downstream
data = data.loc[:, list(dict.fromkeys(ivSelCols))]
## FIX: typo in the printed message ('featurers' -> 'features')
print('Shape of the data after selecting features based on the Information Value:', data.shape)
Considering only the suspicious, strong, medium, and weak variables based on the IV Shape of the data after selecting featurers based on the Information Value: (16346, 19)
## Bar chart: correlation of every remaining independent variable with PPI
corr = data.corrwith(data['PPI']).drop('PPI').sort_values()
fig = px.bar(corr, color_discrete_map = {'Variable': 'red'},
             labels = {'value': 'Correlation', 'index': 'Variable'},
             title = 'Correlation of Independent with Dependent Variables')
fig.update_traces(hovertemplate = 'Variable: %{x} <br>Correlation: %{y}')
fig.update_layout(showlegend = False, template = 'simple_white')
fig.show()
## Getting the updated list of categorical columns and numerical columns
catCols = data.columns[data.columns.str.contains('_le', case=True)]
numCols = list(set(data.columns) - set(catCols))
numCols.remove('PPI')
numCols.remove('Ref')
## Restore the original (string) categories from the label-encoded columns.
## Columns removed by the IV filter are simply skipped.
for k, v in encVal.items():
    try:
        data[k] = data[k + '_le'].replace(v)
    except KeyError:
        ## FIX: was a bare `except: pass`; only a missing '<k>_le' column
        ## (KeyError) is an expected, ignorable condition here
        pass
## Distribution of Income_Range (share of customers per range)
_ = data.groupby('Income_Range').size().to_frame('count').reset_index()
_['Percentage'] = np.round(_['count'] / _['count'].sum(), 3)
_ = _.sort_values(by = 'Percentage', ascending = False)
fig = px.bar(_, x = 'Income_Range', y = 'count', color = 'Income_Range',
             text = _['Percentage'], height = 500, width = 800)
fig.update_layout(template = 'simple_white',
                  title = 'Income range = 6 accounted for 38% of all the income range available')
fig.show()
print('Income_Range - Customers at lower and higher income range don\'t prefer PPI products.')
## Income_Range split by PPI flag, with within-PPI-group percentages
counts = data.groupby(['PPI', 'Income_Range']).size()
_ = counts.reset_index()
_['Percentage'] = counts.groupby(level = 0).apply(lambda s: 100 * s / float(s.sum())).values
_.columns = ['PPI', 'Income_Range', 'Counts', 'Percentage']
_['Percentage'] = _['Percentage'].map('{:,.2f}%'.format)
fig = px.bar(_, x = 'PPI', y = 'Counts', color = 'Income_Range',
             barmode = 'stack', text = _['Percentage'], color_continuous_scale = 'teal')
fig.update_layout(title = 'Ideal income range for PPI is [1-5]', template = 'simple_white',
                  xaxis_title = 'PPI', yaxis_title = 'Counts', width = 600, height = 600)
fig.show()
Income_Range - Customers at lower and higher income range don't prefer PPI products.
## Distribution of Total___of_accounts (percentage share per value)
_ = data.groupby('Total___of_accounts').size().to_frame('count').reset_index()
_['Percentage'] = np.round(100 * _['count'] / _['count'].sum(), 2)
_ = _.sort_values(by = 'Percentage', ascending = False)
fig = px.bar(_, x = 'Total___of_accounts', y = 'count', color = 'Total___of_accounts',
             text = _['Percentage'], height = 500, width = 800)
fig.update_layout(template = 'simple_white',
                  title = 'Customers with 9 number of accounts accounted for a whopping 30% of all customers')
fig.show()
print('Total___of_accounts - Contains negative number !!! Replacing negative with the median')
## NOTE: negatives in this column were already median-replaced earlier in the
## bulk cleaning pass, so this line is a harmless safety no-op
data.loc[data['Total___of_accounts'] < 0, 'Total___of_accounts'] = data['Total___of_accounts'].median()
print('Analysis of total number of accounts by PPI')
## Account counts split by PPI flag, with within-PPI-group percentages
counts = data.groupby(['PPI', 'Total___of_accounts']).size()
_ = counts.reset_index()
_['Percentage'] = counts.groupby(level = 0).apply(lambda s: 100 * s / float(s.sum())).values
_.columns = ['PPI', 'Total___of_accounts', 'Counts', 'Percentage']
_['Percentage'] = _['Percentage'].map('{:,.2f}%'.format)
fig = px.bar(_, x = 'PPI', y = 'Counts', color = 'Total___of_accounts',
             barmode = 'stack', text = _['Percentage'], color_continuous_scale = 'purp')
fig.update_layout(title = 'Ideal number of accounts range for PPI is [1-6]', template = 'simple_white',
                  xaxis_title = 'PPI', yaxis_title = 'Counts', width = 600, height = 600)
fig.show()
Total___of_accounts - Contains negative number !!! Replacing negative with the median Analysis of total number of accounts by PPI
print('Analysis of Total of Accounts, Insurance Description and PPI based on Average Number of Accounts')
## Average number of accounts per Insurance_Description, split by PPI flag
fig = px.histogram(data, y = 'Insurance_Description', x = 'Total___of_accounts',
                   color = 'PPI', histfunc = 'avg',
                   color_discrete_sequence = px.colors.qualitative.Dark24)
layout = dict(title = 'Average of total accounts for customers holding PPI was highest in Joint insurance product',
              template = 'simple_white', xaxis_title = 'category',
              yaxis_title = 'Count of Total___of_accounts', width = 800, height = 500)
fig.update_layout(**layout)
fig.show()
Analysis of Total of Accounts, Insurance Description and PPI based on Average Number of Accounts
print('Analysis of Total of Accounts, Category and PPI based on counts')
## Count histogram of accounts per Insurance_Description, faceted by PPI flag
fig = px.histogram(data, x = 'Insurance_Description', y = 'Total___of_accounts',
                   color = 'Total___of_accounts', histfunc = 'count', facet_col = 'PPI')
layout = dict(title = 'Customers with total of accounts = 9 are to be preferred for offering PPI',
              template = 'simple_white', xaxis_title = 'Insurance_Description',
              yaxis_title = 'Count of Total___of_accounts', width = 800, height = 600)
fig.update_layout(**layout)
fig.show()
Analysis of Total of Accounts, Category and PPI based on counts
print('Age - Contains negative number !!! Replacing negative with the median')
## NOTE: Age negatives were already median-replaced in the bulk cleaning pass;
## kept here as a safety no-op
negative_age = data['Age'] < 0
data.loc[negative_age, 'Age'] = data['Age'].median()
## Bin Age into sextiles, labelling each bin as "left-right"
data['Age_bin'] = pd.qcut(data['Age'], q = 6, precision = 0).apply(lambda b: str(b.left) + '-' + str(b.right))
Age - Contains negative number !!! Replacing negative with the median
## Analysis of Age_bin & PPI (within-PPI-group percentages as bar labels)
counts = data.groupby(['PPI', 'Age_bin']).size()
_ = counts.reset_index(name = 'count')
_['Percentage'] = counts.groupby(level = 0).apply(lambda s: np.round(100 * s / float(s.sum()), 2)).values
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Age_bin', barmode = 'group', text = _['Percentage'])
fig.update_traces(texttemplate = '%{text:.2s}', textposition = 'inside')
fig.update_layout(uniformtext_minsize = 8, uniformtext_mode = 'hide', template = 'simple_white',
                  width = 800, height = 500,
                  title = 'Age group 33-39 & 48-55 hold ~37% of PPI products and age group 18-44 can be targeted the most for selling PPI products',
                  font = dict(size = 9))
fig.show()
## Analysis of Mosaic & PPI
counts = data.groupby(['PPI', 'Mosaic']).size()
_ = counts.reset_index(name = 'count')
_['Percentage'] = counts.groupby(level = 0).apply(lambda s: np.round(100 * s / float(s.sum()), 2)).values
fig = px.bar(_, x = 'Mosaic', y = 'count', color = 'PPI', height = 500, width = 1000,
             template = 'simple_white', facet_col = 'PPI', text = _['Percentage'])
fig.update_layout(title = 'Mosaic = 0 account for 8.5% of customers not holding a PPI product')
fig.show()
## Scatter with marginals to expose Mosaic outliers
fig = px.scatter(_, x = 'Mosaic', y = 'count', size = 'count', color = 'PPI', marginal_y = 'violin',
                 marginal_x = 'box', trendline = 'ols', template = 'simple_white')
fig.update_layout(title = 'Analysis of Mosaic & PPI - Outliers exists')
fig.show()
## Bin the loan Term into 20 equal-width buckets labelled "left-right"
data['Term_bin'] = pd.cut(data['Term'], bins = 20, precision = 0).apply(lambda b: str(b.left) + '-' + str(b.right))
counts = data.groupby(['PPI', 'Term_bin']).size()
_ = counts.reset_index(name = 'count')
_['Percentage'] = counts.groupby(level = 0).apply(lambda s: np.round(100 * s / float(s.sum()), 2)).values
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Term_bin', barmode = 'group',
             text = _['Percentage'], template = 'simple_white',
             color_discrete_sequence = px.colors.qualitative.Prism,
             title = 'About 50% of the terms were of 50-65 months irrespective of the PPI flag')
fig.show()
## Treat negative outstanding mortgage balances as zero, then bin into 10 buckets
data.loc[data['Total_outstanding_balance__mortg'] < 0, 'Total_outstanding_balance__mortg'] = 0
data['Total_outstanding_balance__mortg_bin'] = (pd.cut(data['Total_outstanding_balance__mortg'],
                                                       bins = 10, precision = 3)
                                                .apply(lambda b: str(b.left) + '-' + str(b.right)))
counts = data.groupby(['PPI', 'Total_outstanding_balance__mortg_bin']).size()
_ = counts.reset_index(name = 'count')
_['Percentage'] = counts.groupby(level = 0).apply(lambda s: np.round(100 * s / float(s.sum()), 2)).values
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Total_outstanding_balance__mortg_bin', barmode = 'group',
             text = _['Percentage'], template = 'simple_white',
             color_discrete_sequence = px.colors.qualitative.Pastel2,
             title = 'Close to 70% of the customers had 0-99 outstanding mortgage balance irrespective of the PPI flag')
fig.show()
## Bin Time_at_Address into 10 equal-width buckets labelled "left-right"
data['Time_at_Address_bin'] = (pd.cut(data['Time_at_Address'], bins = 10, precision = 0)
                               .apply(lambda b: str(b.left) + '-' + str(b.right)))
counts = data.groupby(['PPI', 'Time_at_Address_bin']).size()
_ = counts.reset_index(name = 'count')
_['Percentage'] = counts.groupby(level = 0).apply(lambda s: np.round(100 * s / float(s.sum()), 2)).values
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Time_at_Address_bin', barmode = 'group',
             text = _['Percentage'], template = 'simple_white',
             color_discrete_sequence = px.colors.qualitative.Plotly,
             title = 'Customers staying at the address between 0 and 260 days can be prioritized for cross-selling PPI products')
fig.show()
## Bin Credit_Score into 15 equal-width buckets labelled "left-right"
data['Credit_Score_bin'] = (pd.cut(data['Credit_Score'], bins = 15, precision = 0)
                            .apply(lambda b: str(b.left) + '-' + str(b.right)))
counts = data.groupby(['PPI', 'Credit_Score_bin']).size()
_ = counts.reset_index(name = 'count')
_['Percentage'] = counts.groupby(level = 0).apply(lambda s: np.round(100 * s / float(s.sum()), 2)).values
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Credit_Score_bin', barmode = 'group',
             text = _['Percentage'], template = 'simple_white',
             color_discrete_sequence = px.colors.qualitative.Vivid,
             title = 'PPI products are more likely to be sold to customers with a credit score between 792 and 918')
fig.show()
## Scatter of raw Credit_Score vs counts, faceted by PPI, to expose outliers
counts = data.groupby(['PPI', 'Credit_Score']).size()
_ = counts.reset_index(name = 'count')
_['Percentage'] = counts.groupby(level = 0).apply(lambda s: np.round(100 * s / float(s.sum()), 2)).values
fig = px.scatter(_, x = 'Credit_Score', y = 'count', size = 'count', color = 'PPI', marginal_y = 'violin',
                 marginal_x = 'box', trendline = 'ols', template = 'simple_white', facet_col = 'PPI')
fig.update_layout(title = 'Analysis of Credit Score & PPI - Outliers exists')
fig.show()
## Bin Value_of_Property into quintiles labelled "left-right"
data['Value_of_Property_bin'] = (pd.qcut(data['Value_of_Property'], q = 5, precision = 0)
                                 .apply(lambda b: str(b.left) + '-' + str(b.right)))
counts = data.groupby(['PPI', 'Value_of_Property_bin']).size()
_ = counts.reset_index(name = 'count')
_['Percentage'] = counts.groupby(level = 0).apply(lambda s: np.round(100 * s / float(s.sum()), 2)).values
fig = px.bar(_, x = 'PPI', y = 'count', color = 'Value_of_Property_bin', barmode = 'group',
             text = _['Percentage'], template = 'simple_white',
             color_discrete_sequence = px.colors.qualitative.Plotly,
             title = 'PPI products can be targeted to customers with a value of property b/w 115k and 120k, accounts for >30%')
fig.show()
Steps performed include:
- Analysing the missing values in Insurance_Description.
- Checking the correlation of Insurance_Description with the independent features in the dataframe.
- Training classifiers with Insurance_Description as the target variable.
- Imputing the missing Insurance_Description values using a CatBoost Classifier, achieving a recall and F1-score of ~87%.
dataModel = data.copy(deep = True)
## Drop the helper binned columns, the decoded string categoricals (the *_le
## versions stay as model features), and the PPI target itself
dropCols = list(dataModel.columns[dataModel.columns.str.contains('_bin')])
dropCols += list(dataModel.select_dtypes('object').columns)
dropCols += ['PPI'] ##'prdt_desc_le'
print('List of columns to be dropped from dataModel dataframe:', dropCols)
## Drop above columns
dataModel.drop(dropCols, axis = 1, inplace = True)
print('Shape of dataframe after dropping binned and other variables:', dataModel.shape)
print('\n\nColumns in dataModel dataframe:', list(dataModel.columns))
List of columns to be dropped from dataModel dataframe: ['Age_bin', 'Term_bin', 'Total_outstanding_balance__mortg_bin', 'Time_at_Address_bin', 'Credit_Score_bin', 'Value_of_Property_bin', 'Final_Grade', 'Residential_Status', 'Marital_Status', 'Insurance_Description', 'PPI'] Shape of dataframe after dropping binned and other variables: (16346, 18) Columns in dataModel dataframe: ['Value_of_Property', 'Age', 'Term', 'Insurance_Description_le', 'Marital_Status_le', 'code', 'Final_Grade_le', 'Time_at_Address', 'Mosaic', 'Total_value__CAIS_8_9s', 'Total_outstanding_balance__mortg', 'Ref', 'Income_Range', 'Total___of_accounts', 'Total_value__Public_Info___CCJ__', 'PPI_LCI', 'Credit_Score', 'Residential_Status_le']
## Correlation of independent variables with the new imputation target
target = 'Insurance_Description_le'
corr = dataModel.corrwith(dataModel[target]).sort_values().drop([target, 'Ref'])
fig = px.bar(corr, color_discrete_map = {'Variable': 'purple'},
             labels = {'value': 'Correlation', 'index': 'Variable'},
             title = 'Correlation of Independent with Dependent Variables')
fig.update_traces(hovertemplate = 'Variable: %{x} <br>Correlation: %{y}')
fig.update_layout(showlegend = False, template = 'simple_white')
fig.show()
## Show the label-encoder mappings relevant to interpreting the target
for name in ('Insurance_Description', 'Residential_Status', 'category'):
    print('Checking what\'s was encoded as what in the ' + name + ' variable::', encVal[name])
Checking what's was encoded as what in the Insurance_Description variable:: {3: 'Missing', 2: 'Life & Critical Illn', 0: 'Joint', 5: 'Single', 4: 'Others', 1: 'LASU'}
Checking what's was encoded as what in the Residential_Status variable:: {0: 'H', 1: 'Others', 2: 'T'}
Checking what's was encoded as what in the category variable:: {2: 'Missing', 1: 'LCI', 0: 'Joint', 3: 'Single'}
## In the label-encoded Insurance_Description variable, code 3 == 'Missing'.
## Rows with a known description become the training set; rows with the
## 'Missing' code are the set whose description we will impute.
miss = 3
known = dataModel[target] != miss
train = dataModel.loc[known]
print('Shape of training data:', train.shape)
test = dataModel.loc[~known]
referenceIDs = list(test['Ref'])  # keep row ids to re-attach predictions later
del test[target]
print('Shape of validation data:', test.shape)
Shape of training data: (9461, 18) Shape of validation data: (6885, 17)
## Independent and dependent variables from the training data
X = train.drop([target, 'Ref'], axis = 1)
y = train[target]
## Creating train and validation set from training data - Stratifying the target variable
## RobustScaler (median/IQR based) is fitted on the labelled rows only and
## then applied to the unlabelled rows, avoiding leakage from the test set
sc = sklearn.preprocessing.RobustScaler()
X = sc.fit_transform(X)
X_test = sc.transform(test.drop(['Ref'], axis = 1).values)
X_tr, X_val, y_tr, y_val = sklearn.model_selection.train_test_split(X, y, random_state = 10, stratify = y)
print('Shape of independent training and validation features is', X_tr.shape, 'and', X_val.shape, 'respectively')
Shape of independent training and validation features is (7095, 16) and (2366, 16) respectively
## Verifying whether stratify worked or not -- the class proportions in the
## train and validation splits should be (near-)identical
display(y_tr.value_counts(normalize = True), y_val.value_counts(normalize = True))
1 0.248485 4 0.232981 2 0.218605 5 0.195349 0 0.104581 Name: Insurance_Description_le, dtype: float64
1 0.248521 4 0.232883 2 0.218512 5 0.195689 0 0.104396 Name: Insurance_Description_le, dtype: float64
# Run models_evaluation for classifier models
# (helper from `modelCompare`, imported via star-import at the top of the
#  notebook; presumably 5 = number of CV folds -- TODO confirm in modelCompare)
models_evaluation(X_tr, y_tr, 5)
| Logistic Regression | Support Vector Classifier | Decision Tree | Random Forest | Gaussian Naive Bayes | KNearest Neighbor | XG Boost | AdaBoost | Gradient Boosting | Cat Boost | Best Score | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 0.589711 | 0.558562 | 0.911910 | 0.923890 | 0.649753 | 0.497815 | 0.946441 | 0.522763 | 0.950810 | 0.947146 | Gradient Boosting |
| Precision | 0.565169 | 0.532896 | 0.913367 | 0.924257 | 0.684137 | 0.534952 | 0.946964 | 0.543253 | 0.951875 | 0.947929 | Gradient Boosting |
| Recall | 0.589711 | 0.558562 | 0.911910 | 0.923890 | 0.649753 | 0.497815 | 0.946441 | 0.522763 | 0.950810 | 0.947146 | Gradient Boosting |
| F1 Score | 0.562858 | 0.518771 | 0.912375 | 0.922318 | 0.572572 | 0.507929 | 0.945394 | 0.446340 | 0.949826 | 0.946000 | Gradient Boosting |
## CatBoost Classifier - fit on the training split, report on the validation split
cb_model = CatBoostClassifier(random_state = 2021, verbose = False) ##, criterion = 'entropy' in case of RFC
cb_model.fit(X_tr, y_tr)
y_pred = cb_model.predict(X_val)
print('Weighted average recall and F1-score 94% on the validation set')
print(sklearn.metrics.classification_report(y_val, y_pred))
Weighted average recall and F1-score 94% on the validation set
precision recall f1-score support
0 0.98 0.91 0.95 247
1 0.88 0.99 0.93 588
2 0.95 1.00 0.97 517
4 0.95 0.79 0.87 551
5 0.99 1.00 1.00 463
accuracy 0.94 2366
macro avg 0.95 0.94 0.94 2366
weighted avg 0.94 0.94 0.94 2366
## CatBoost Classifier - refit on ALL labelled rows, then predict the
## missing Insurance_Description codes for the unlabelled rows
cb_model = CatBoostClassifier(random_state = 2021, verbose = False)
cb_model.fit(X, y)
## predict() returns a column vector; ravel() flattens it to 1-D
y_pred = np.array(cb_model.predict(X_test)).ravel()
print('Value counts in the predicted category')
display(pd.Series(y_pred).value_counts())
Value counts in the predicted category
2 4824 4 2048 5 13 dtype: int64
## Attach the predicted Insurance_Description codes back to the held-out rows
## via their reference ids
referenceIDs = pd.DataFrame({'Ref': referenceIDs, target: y_pred})
test = test.merge(referenceIDs, on = 'Ref', how = 'left')
## Rebuild dataModel from the labelled + newly imputed rows and shuffle it
dataModel = pd.concat([train, test]).sample(frac = 1)
## Shape of the dataModel dataframe::
print('Shape of dataModel dataframe', dataModel.shape)
## Value counts of the (now fully populated) target variable
display(dataModel[target].value_counts(dropna = False))
Shape of dataModel dataframe (16346, 18)
2 6892 4 4252 1 2351 5 1862 0 989 Name: Insurance_Description_le, dtype: int64
Assuming, predicted products (Insurance_Description) are the products customers currently holding, using market basket analysis to explore the products customers tend to buy together, and use that information to cross-sell other insurance products. Association Rule Mining is used when we want to find an association between different objects in a set, find frequent patterns in a transaction database, relational databases or any other information repository. Some of the applications of market basket analysis include: recommendation engine, cross-sell / bundle products, arranging items in the retail stores, credit card purchases of customers to build profiles for fraud detection purposes and cross-selling opportunities, telecom marketing efforts at customers, etc.
Cross-selling means encouraging a customer who buy product to buy a related or complementary product, with a view to expand banking business, reduce the per customer cost of operation and provide more satisfaction and value to the customer.
Apriori Algorithm & Matrices
Confidence: Likelihood that customer who bought both A and B. It is the ratio of the number of transactions involving both A and B and the number of transactions involving B.
Lift: Increase in the sale of A when you sell B.
“Frequently Bought Together” → Association
“Customers who bought this item also bought” → Recommendation
Source: KD Nuggets & Research Paper
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
## Bring back the original categorical labels -- the label-encoded copies are
## no longer needed for market-basket analysis
for k, v in encVal.items():
    try:
        dataModel[k] = dataModel[k + '_le'].replace(v).astype('category')
        del dataModel[k + '_le']
    except KeyError:
        ## FIX: was a bare `except: pass`; only a missing '<k>_le' column
        ## (dropped earlier in feature selection) should be skipped silently
        pass
## Reference ID is of no further use
del dataModel['Ref']
## Distinct-value count per column, coloured by dtype
_ = dataModel.nunique().reset_index().merge(dataModel.dtypes.reset_index(), on = 'index', how = 'left')
_.columns = ['Variable', 'Distinct Values', 'DTypes']
_ = _.sort_values(by = 'Distinct Values', ascending = False)
_['DTypes'] = _['DTypes'].astype(str)
fig = px.bar(_, x = 'Variable', y = 'Distinct Values', color = 'DTypes',
             template = 'simple_white', color_discrete_sequence = px.colors.qualitative.Vivid,
             labels = {'sum of Distinct Values': 'Distinct Values'}, text = _['Distinct Values'])
fig.show()
## Helper: binarise basket counts (any positive count -> 1, otherwise 0)
against_what = 'Income_Range'
def encode(x):
    """Return 1 when the basket count ``x`` is positive, else 0.

    FIX: the original returned None for 0 < x < 1. Crosstab counts are
    integers, so that branch was unreachable, but the gap is now closed.
    """
    return 1 if x > 0 else 0
## Basket: Income_Range x Insurance_Description cross-tab, binarised to
## ownership flags via encode()
basket = pd.crosstab(dataModel[against_what], dataModel['Insurance_Description']).applymap(encode)
## Frequent itemsets with at least 5% support
frequent_items = apriori(basket, min_support = 0.05, use_colnames = True, low_memory = True)
## Association rules at >= 20% confidence, strongest lift first
rules = association_rules(frequent_items, metric = 'confidence', min_threshold = 0.2)
rules = rules.sort_values('lift', ascending = False).reset_index(drop = True)
# Frozensets -> comma-separated strings, then count the items on each side
for side in ('antecedents', 'consequents'):
    rules[side] = rules[side].apply(lambda items: ', '.join(list(items)))
rules['Number of Antecedents'] = rules['antecedents'].str.count(',') + 1
rules['Number of Consequents'] = rules['consequents'].str.count(',') + 1
print('Number of association rules:', len(rules))
display(rules.head())
Number of association rules: 180
| antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction | Number of Antecedents | Number of Consequents | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LASU | Joint | 0.857143 | 0.857143 | 0.857143 | 1.0 | 1.166667 | 0.122449 | inf | 1 | 1 |
| 1 | Joint | Others, Life & Critical Illn, LASU | 0.857143 | 0.857143 | 0.857143 | 1.0 | 1.166667 | 0.122449 | inf | 1 | 3 |
| 2 | LASU | Others, Joint | 0.857143 | 0.857143 | 0.857143 | 1.0 | 1.166667 | 0.122449 | inf | 1 | 2 |
| 3 | Joint | Others, LASU | 0.857143 | 0.857143 | 0.857143 | 1.0 | 1.166667 | 0.122449 | inf | 1 | 2 |
| 4 | LASU, Single | Joint | 0.857143 | 0.857143 | 0.857143 | 1.0 | 1.166667 | 0.122449 | inf | 2 | 1 |
## Keep only rules with lift > 1 -- lift above 1 vouches for a positive
## association between {Y} and {X}: the larger the lift, the likelier a
## customer who already bought {X} will prefer {Y}
rules = rules.loc[rules['lift'] > 1]
## Enumerate the antecedent/consequent size combinations present in the rules
possible_strategies = rules[['Number of Antecedents', 'Number of Consequents']].drop_duplicates().reset_index(drop = True)
possible_strategies = (possible_strategies
                       .groupby(['Number of Antecedents', 'Number of Consequents'])
                       .size().to_frame('Count').reset_index())
text = ('# of Antecedents = ' + possible_strategies['Number of Antecedents'].astype(str)
        + '<br>' + '# of Consequents = ' + possible_strategies['Number of Consequents'].astype(str))
fig = px.bar(possible_strategies, x = 'Number of Antecedents', y = 'Number of Consequents',
             barmode = 'group', color = 'Count', template = 'simple_white',
             title = 'Number of Antecedents and Consequents', text = text,
             height = 600, width = 900)
fig.update_layout(yaxis_visible = True, yaxis_showticklabels = False)
fig.update(layout_coloraxis_showscale = False)
fig.show()
# Pivot the rules into a consequents x antecedents support matrix
support_table = rules.pivot(index = 'consequents', columns = 'antecedents', values = 'support')
fig = px.imshow(support_table, template = 'simple_white', height = 800, width = 800,
                title = 'Heatmap of Antecedents & Consequents, by Support')
fig.show()
## Keep only rules suggesting at least 2 consequents (true cross-sell bundles)
rules = rules.loc[rules['Number of Consequents'] >= 2]
## Possible strategies with a minimum of 2 consequents
print('14 possible combos possible with 1 antecedent; 18 with 2 antecedents and 6 with 3 antecedents')
display(rules['Number of Antecedents'].value_counts().sort_index())
14 possible combos possible with 1 antecedent; 18 with 2 antecedents and 6 with 3 antecedents
1 14 2 18 3 6 Name: Number of Antecedents, dtype: int64
## Rules with exactly 1 antecedent (and >= 2 consequents) --------------
a = 1
ante = rules.loc[rules['Number of Antecedents'] == a]
print('Antecedents being bought:', collections.Counter(list(ante['antecedents'])))
l = list(ante['consequents'])
conse = collections.Counter(l)
print('Consequents being bought:', conse)
#display(ante)
## NetworkX view of the antecedent -> consequent links
fig = plt.figure(figsize = (10, 10))
G = nx.from_pandas_edgelist(ante, 'antecedents', 'consequents')
nx.draw(G, with_labels = True, node_color = 'skyblue', pos = nx.spring_layout(G), node_size = 50)
plt.show()
Antecedents being bought: Counter({'Joint': 7, 'LASU': 7})
Consequents being bought: Counter({'Others, Life & Critical Illn, LASU': 1, 'Others, Joint': 1, 'Others, LASU': 1, 'Single, Joint': 1, 'LASU, Single': 1, 'Others, Single, Joint': 1, 'Others, LASU, Single': 1, 'Others, Life & Critical Illn, Joint': 1, 'Others, Life & Critical Illn, LASU, Single': 1, 'Others, Life & Critical Illn, Single, Joint': 1, 'Life & Critical Illn, Single, Joint': 1, 'Life & Critical Illn, LASU, Single': 1, 'Life & Critical Illn, Joint': 1, 'Life & Critical Illn, LASU': 1})
## 2 antecedent and >=2 consequents --------------
# Restrict to rules driven by exactly two antecedent products.
n_antecedents = 2
subset = rules.loc[rules['Number of Antecedents'] == n_antecedents]
# Frequency of each antecedent / consequent itemset among these rules.
print('Antecedents being bought:', collections.Counter(subset['antecedents'].tolist()))
consequent_counts = collections.Counter(subset['consequents'].tolist())
print('Consequents being bought:', consequent_counts)
## NetworkX graph - visualize which antecedent pairs link to which consequent bundles
plt.figure(figsize = (10, 10))
graph = nx.from_pandas_edgelist(subset, 'antecedents', 'consequents')
nx.draw(graph, with_labels = True, node_color = 'skyblue',
        pos = nx.spring_layout(graph), node_size = 50)
plt.show()
Antecedents being bought: Counter({'Life & Critical Illn, Joint': 3, 'Life & Critical Illn, LASU': 3, 'Others, Joint': 3, 'Others, LASU': 3, 'Single, Joint': 3, 'LASU, Single': 3})
Consequents being bought: Counter({'Others, LASU': 2, 'Others, Joint': 2, 'Life & Critical Illn, LASU': 2, 'Life & Critical Illn, Joint': 2, 'Single, Joint': 2, 'LASU, Single': 2, 'Others, Life & Critical Illn, Joint': 1, 'Life & Critical Illn, Single, Joint': 1, 'Others, Single, Joint': 1, 'Others, Life & Critical Illn, LASU': 1, 'Life & Critical Illn, LASU, Single': 1, 'Others, LASU, Single': 1})
## 3 antecedent and >=2 consequents --------------
# Restrict to rules driven by exactly three antecedent products.
# NOTE(review): variable names a/ante/l/conse are kept in case code further
# down the file (outside this view) reads them.
a = 3
ante = rules.loc[rules['Number of Antecedents'].eq(a)]
# Frequency of each antecedent / consequent itemset among these rules.
print('Antecedents being bought:', collections.Counter(ante['antecedents'].tolist()))
l = ante['consequents'].tolist()
conse = collections.Counter(l)
print('Consequents being bought:', conse)
## NetworkX graph - visualize which antecedent triples link to which consequent bundles
fig = plt.figure(figsize = (10, 10))
G = nx.from_pandas_edgelist(ante, 'antecedents', 'consequents')
nx.draw(G, with_labels = True, node_color = 'skyblue',
        pos = nx.spring_layout(G), node_size = 50)
plt.show()
Antecedents being bought: Counter({'Others, Life & Critical Illn, Joint': 1, 'Life & Critical Illn, Single, Joint': 1, 'Others, LASU, Single': 1, 'Life & Critical Illn, LASU, Single': 1, 'Others, Life & Critical Illn, LASU': 1, 'Others, Single, Joint': 1})
Consequents being bought: Counter({'LASU, Single': 1, 'Others, LASU': 1, 'Life & Critical Illn, Joint': 1, 'Others, Joint': 1, 'Single, Joint': 1, 'Life & Critical Illn, LASU': 1})